{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mega Parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started parsing the file under job_id e5e0367d-2f83-4e4d-84e5-4d5df7119516\n",
      "Started parsing the file under job_id 0b5d66aa-bbab-454b-b256-82495d20f91f\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n",
      "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "from megaparse.Converter import MegaParse\n",
    "import os \n",
    "\n",
    "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n",
    "\n",
    "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n",
    "md_content = converter.convert()\n",
    "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\"))\n",
    "\n",
    "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n",
    "md_content = converter.convert(gpt4o_cleaner = True)\n",
    "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\"))\n",
    "\n",
    "\n",
    "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n",
    "md_content = converter.convert()\n",
    "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LLama Parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started parsing the file under job_id f78ee794-ffde-4e0a-938d-987f1b22cfcb\n"
     ]
    }
   ],
   "source": [
    "from typing import List\n",
    "from llama_index.core.schema import Document\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()\n",
    "#GET LLAMA_CLOUD_API_KEY\n",
    "import os\n",
    "from llama_parse import LlamaParse\n",
    "from llama_parse.utils import ResultType, Language\n",
    "\n",
    "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n",
    "\n",
    "parsing_instructions = \"Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables.\"\n",
    "\n",
    "parser = LlamaParse(\n",
    "    api_key=str(api_key), \n",
    "    result_type=ResultType.MD,\n",
    "    gpt4o_mode=True,\n",
    "    verbose=True,\n",
    "    language=Language.FRENCH,\n",
    "    parsing_instruction=parsing_instructions,  # Optionally you can define a parsing instruction\n",
    ")\n",
    "# sync\n",
    "documents: List[Document] = parser.load_data(\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n",
    "\n",
    "with open(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"w\") as f:\n",
    "        f.write(documents[0].get_content())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unstructured"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import UnstructuredPDFLoader\n",
    "loader = UnstructuredPDFLoader(\"../megaparse/tests/input_tests/MegaFake_report.pdf\", strategy=\"hi_res\", infer_table_structure=True,\n",
    ")\n",
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"w\") as f:\n",
    "        f.write(data[0].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluation with Diff Lib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import difflib\n",
    "\n",
    "def read_file(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return file.readlines()\n",
    "\n",
    "def compare_files(source_path, target_path):\n",
    "    source_lines = read_file(source_path)\n",
    "    target_lines = read_file(target_path)\n",
    "\n",
    "    diff = difflib.unified_diff(\n",
    "    source_lines,\n",
    "    target_lines,\n",
    "    fromfile='target.md',\n",
    "    tofile='generated.md',\n",
    "    lineterm=''\n",
    "    )\n",
    "\n",
    "    modifications = 0\n",
    "    for line in diff:\n",
    "        #print(line)\n",
    "        if line.startswith('+') and not line.startswith('+++'):\n",
    "            modifications += 1\n",
    "        elif line.startswith('-') and not line.startswith('---'):\n",
    "            modifications += 1\n",
    "\n",
    "    return modifications\n",
    "    \n",
    "diff_megaparse_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n",
    "diff_megaparse_llama_gptcleaner = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n",
    "diff_megaparse_llama = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n",
    "diff_llamaparse = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n",
    "diff_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "diff_results = {\n",
    "    \"**Megaparse**\": diff_megaparse_unstructured,\n",
    "    \"Megaparse with LLamaParse\": diff_megaparse_llama,\n",
    "    \"Megaparse with LLamaParse and GPTCleaner\": diff_megaparse_llama_gptcleaner,\n",
    "    \"LLama Parse\": diff_llamaparse\n",
    "}\n",
    "\n",
    "# Sort the results\n",
    "sorted_diff_results = sorted(diff_results.items(), key=lambda x: x[1])\n",
    "\n",
    "# Generate a table with the results\n",
    "benchmark_results = \"| Parser | Diff |\\n|---|---|\\n\"\n",
    "for parser, diff in sorted_diff_results:\n",
    "    benchmark_results += f\"| {parser} | {diff} |\\n\"\n",
    "\n",
    "# Update README.md file\n",
    "with open(\"../README.md\", \"r\") as readme_file:\n",
    "    readme_content = readme_file.read()\n",
    "\n",
    "start_marker = \"<!---BENCHMARK-->\"\n",
    "end_marker = \"<!---END_BENCHMARK-->\"\n",
    "start_index = readme_content.find(start_marker) + len(start_marker)\n",
    "end_index = readme_content.find(end_marker)\n",
    "\n",
    "updated_readme_content = readme_content[:start_index] + \"\\n\" + benchmark_results + readme_content[end_index:]\n",
    "\n",
    "with open(\"../README.md\", \"w\") as readme_file:\n",
    "    readme_file.write(updated_readme_content)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "QuivrParse-DS8JDGq8",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
